\n \n\n

\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n

<\/td>\n	// Learn more about F# at http://fsharp.net<\/td>\n <\/tr>\n
<\/td>\n	// See the 'F# Tutorial' project for more help.<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	open System.IO<\/td>\n <\/tr>\n
<\/td>\n	open SevenZip<\/td>\n <\/tr>\n
<\/td>\n	open System<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	SevenZipCompressor.SetLibraryPath(@"path/to/7z.dll")<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let dir = @"files/path/..."<\/td>\n <\/tr>\n
<\/td>\n	let txts = Directory.GetFiles(dir + @"Test")<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let sz = SevenZip.SevenZipCompressor()<\/td>\n <\/tr>\n
<\/td>\n	let st = System.Diagnostics.Stopwatch()<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	sz.CompressionMethod <- CompressionMethod.Ppmd<\/td>\n <\/tr>\n
<\/td>\n	sz.CompressionLevel <- CompressionLevel.Low<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let compress2 (f:byte[]) = <\/td>\n <\/tr>\n
<\/td>\n	use mio = new MemoryStream(f)<\/td>\n <\/tr>\n
<\/td>\n	use m2 = new MemoryStream(f.Length * 2) <\/td>\n <\/tr>\n
<\/td>\n	sz.CompressStream(mio, m2)<\/td>\n <\/tr>\n
<\/td>\n	m2.GetBuffer().[0..int m2.Length - 1]<\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	st.Start()<\/td>\n <\/tr>\n
<\/td>\n	let compressionMap = txts \|> Array.map (fun f -> Path.GetFileNameWithoutExtension f, f \|> File.ReadAllBytes \|> compress2) \|> Map.ofArray<\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	let compredist f1 f2 n1 n2 =<\/td>\n <\/tr>\n
<\/td>\n	let code = compressionMap.[n1]<\/td>\n <\/tr>\n
<\/td>\n	let code2 = compressionMap.[n2]<\/td>\n <\/tr>\n
<\/td>\n	let fxy = Array.append f1 f2<\/td>\n <\/tr>\n
<\/td>\n	let code3 = compress2 (fxy) <\/td>\n <\/tr>\n
<\/td>\n	float(code3.Length - (min (code.Length) (code2.Length))) / float(max (code.Length) (code2.Length)) <\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	let nearEdges = [\| for f in txts -> let n1 = Path.GetFileNameWithoutExtension f <\/td>\n <\/tr>\n
<\/td>\n	n1, <\/td>\n <\/tr>\n
<\/td>\n	let fbytes = File.ReadAllBytes f <\/td>\n <\/tr>\n
<\/td>\n	txts \|> Array.map (fun fname -> <\/td>\n <\/tr>\n
<\/td>\n	let n2 = Path.GetFileNameWithoutExtension fname<\/td>\n <\/tr>\n
<\/td>\n	Path.GetFileNameWithoutExtension fname, compredist fbytes (File.ReadAllBytes(fname)) n1 n2) <\/td>\n <\/tr>\n
<\/td>\n	\|> Array.sortBy snd\|]<\/td>\n <\/tr>\n
<\/td>\n	st.Stop() <\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let nearEdgesMap = nearEdges \|> Map.ofArray<\/td>\n <\/tr>\n
<\/td>\n	let pairs = nearEdgesMap \|> Map.map (fun _ v -> Map.ofArray v)<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	printfn "%A" nearEdges <\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	type 'a Tree =<\/td>\n <\/tr>\n
<\/td>\n	\| Node of 'a<\/td>\n <\/tr>\n
<\/td>\n	\| Branch of 'a Tree * 'a Tree<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	type Cluster<'a when 'a : comparison> = <\/td>\n <\/tr>\n
<\/td>\n	\| Singleton of 'a Set<\/td>\n <\/tr>\n
<\/td>\n	\| Clusters of 'a Set * 'a Tree<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let completelinkage (ps:Map<'a, Map<'a,float>>) (a: 'a Set) (b:'a Set) = <\/td>\n <\/tr>\n
<\/td>\n	a \|> Set.map (fun item1 -> b \|> Set.map (fun item2 -> ps.[item1].[item2]) <\/td>\n <\/tr>\n
<\/td>\n	\|> Set.maxElement) //we only want the two largest pair distances) <\/td>\n <\/tr>\n
<\/td>\n	\|> Set.maxElement<\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	let distclust ps = function <\/td>\n <\/tr>\n
<\/td>\n	\| Singleton (item), Clusters(items, _) -> completelinkage ps item items<\/td>\n <\/tr>\n
<\/td>\n	\| Clusters (items, _) , Singleton(item) -> completelinkage ps items item<\/td>\n <\/tr>\n
<\/td>\n	\| Clusters (items1, _), Clusters(items2, _) -> completelinkage ps items1 items2<\/td>\n <\/tr>\n
<\/td>\n	\| Singleton (item1) , Singleton(item2) -> ps.[item1.MaximumElement].[item2.MaximumElement]<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let mergeClusters = function <\/td>\n <\/tr>\n
<\/td>\n	\| Singleton (item), Clusters(items, dendogram) <\/td>\n <\/tr>\n
<\/td>\n	\| Clusters (items, dendogram) , Singleton(item) -> Clusters(Set.union item items, Branch(dendogram, Node item.MinimumElement))<\/td>\n <\/tr>\n
<\/td>\n	\| Clusters (items1, dendogram1), Clusters(items2, dendogram2) -> Clusters(Set.union items1 items2, Branch(dendogram1, dendogram2))<\/td>\n <\/tr>\n
<\/td>\n	\| Singleton (item1) , Singleton(item2) -> Clusters(Set.union item1 item2, Branch(Node item1.MinimumElement, Node item2.MinimumElement))<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let r = Random()<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	(* <\/td>\n <\/tr>\n
<\/td>\n	A function that takes a cluster and a set of clusters and finds the nearest item using cluster dist functions<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	A function that takes a cluster and an item and calculates distance as maxdist (item, clustermember)<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	There is a map that holds every item and its neighbiors<\/td>\n <\/tr>\n
<\/td>\n	If we have an item we find the closest item by looking it up in the map.<\/td>\n <\/tr>\n
<\/td>\n	But we also need to find the closest in the cluster. So we must compare the item to a cluster<\/td>\n <\/tr>\n
<\/td>\n	To do this we for each cluster, compare the distance to our current item<\/td>\n <\/tr>\n
<\/td>\n	If an item is closest we add the merged 2 to the cluster stack as a branch and remove the item from actives<\/td>\n <\/tr>\n
<\/td>\n	If a cluster is closest we merge the item to the tree, remove it from the cluster stack and add the new tree to the stack<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	If the next item we are looking at is a cluster we must find the closest item.<\/td>\n <\/tr>\n
<\/td>\n	To find it in the single set we map each item to its distance from the cluster using dist clust<\/td>\n <\/tr>\n
<\/td>\n	We also sort the cluster set by distance from current cluster<\/td>\n <\/tr>\n
<\/td>\n	Again if the single item is the closest we merge with cluster and remove from map;<\/td>\n <\/tr>\n
<\/td>\n	If the cluster is the closest we remove both clusters from clusterset, merge them and put them back<\/td>\n <\/tr>\n
<\/td>\n	Recurse <\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	Item , Item -> Pack as a Singleton<\/td>\n <\/tr>\n
<\/td>\n	*)<\/td>\n <\/tr>\n
<\/td>\n	// (clusterset : Map<string, string Cluster>) <\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let asCluster x = Singleton (set [x])<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let closestinActives distances cluster (item : string Set) = <\/td>\n <\/tr>\n
<\/td>\n	item \|> Set.map (fun s -> distclust distances (asCluster s, cluster), s)<\/td>\n <\/tr>\n
<\/td>\n	\|> Set.minElement <\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let find points first closest (distances : Map<string, Map<string,float>>) = <\/td>\n <\/tr>\n
<\/td>\n	let initialActives = points \|> Set.ofArray \|> Set.remove first <\/td>\n <\/tr>\n
<\/td>\n	\|> Set.remove closest <\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let rec seek (stack : string Cluster list) (actives : string Set) = <\/td>\n <\/tr>\n
<\/td>\n	let current = stack.Head <\/td>\n <\/tr>\n
<\/td>\n	if stack.Length = 1 && actives = Set.empty then current<\/td>\n <\/tr>\n
<\/td>\n	else <\/td>\n <\/tr>\n
<\/td>\n	let nextDist, next = if actives.Count = 0 then Double.MaxValue,"" else closestinActives distances current actives<\/td>\n <\/tr>\n
<\/td>\n	if stack.Length = 1 then seek (asCluster next :: stack) (actives.Remove(next))<\/td>\n <\/tr>\n
<\/td>\n	else let topofstack = stack.Tail.Head <\/td>\n <\/tr>\n
<\/td>\n	let stackDist = distclust distances (topofstack, current) <\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	if nextDist < stackDist then<\/td>\n <\/tr>\n
<\/td>\n	seek (asCluster next :: stack) (actives.Remove(next))<\/td>\n <\/tr>\n
<\/td>\n	else seek ((mergeClusters (current, topofstack)) :: (stack.Tail.Tail)) actives<\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	seek [asCluster closest ; asCluster first] initialActives<\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	let rec toGraph depth = function<\/td>\n <\/tr>\n
<\/td>\n	\| Node(x) -> x, "", " node\\r\\n [\\r\\n id\\t\\""+x+"\\"\\r\\n label\\t\\"" + x + "\\"\\r\\n ]\\r\\n"<\/td>\n <\/tr>\n
<\/td>\n	\| Branch(ltree,rtree) -> let lname, lgraph, names1 = toGraph (depth + 1) ltree<\/td>\n <\/tr>\n
<\/td>\n	let rname, rgraph, names2 = toGraph (depth + 1) rtree<\/td>\n <\/tr>\n
<\/td>\n	let name = string (r.Next(0, int(2. ** (float depth + 9.))) )<\/td>\n <\/tr>\n
<\/td>\n	name, sprintf "%s\\r\\n%s\\r\\n edge\\r\\n [\\r\\n source\\t\\"%s\\"\\r\\n target\\t\\"%s\\"\\r\\n ]\\r\\n edge\\r\\n [\\r\\n source\\t\\"%s\\"\\r\\n target\\t\\"%s\\"\\r\\n ]" <\/td>\n <\/tr>\n
<\/td>\n	lgraph rgraph name lname name rname, <\/td>\n <\/tr>\n
<\/td>\n	(sprintf " node\\r\\n [\\r\\n id\\t\\"%s\\"\\r\\n label\\t\\"\\"\\r\\n ]\\r\\n" name) + names1 + names2<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	let first = fst nearEdges.[r.Next(0,txts.Length)]<\/td>\n <\/tr>\n
<\/td>\n	let closest = fst nearEdgesMap.[first].[1]<\/td>\n <\/tr>\n
<\/td>\n	let items, fcluster = (function \| Clusters(leset, letree) -> leset, letree) (find (nearEdges \|> Array.map fst) first closest pairs)<\/td>\n <\/tr>\n
<\/td>\n	\n<\/td>\n <\/tr>\n
<\/td>\n	let _, outgraph, nodes = toGraph 0 fcluster <\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n
<\/td>\n	let n = "graph [" + nodes + outgraph + "]"<\/td>\n <\/tr>\n
<\/td>\n	File.WriteAllText("mbook.gml", n)<\/td>\n <\/tr>\n
<\/td>\n	<\/td>\n <\/tr>\n<\/table>\n\n\n <\/div>\n\n <\/div>\n \n<\/div>\n\n <\/div>\n \n view raw<\/a>\n Nearest Compression Distance<\/a>\n hosted with ❤ by GitHub<\/a>\n <\/div>\n <\/div>\n<\/div>\n')